library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend)
library(expss)
library(knitr)
Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.
# Load csvs
datExpr = read.delim('./../Data/datExpr.csv')
datMeta = read.delim('./../Data/datPheno.csv')
# Create dataset with gene information
datGenes = data.frame('Ensembl_ID' = substr(datExpr$Gene, 1, 15),
'gene_name' = substring(datExpr$Gene, 17))
rownames(datExpr) = datGenes$Ensembl_ID
datExpr$Gene = NULL
### CLEAN METADATA DATA FRAME
datMeta = datMeta %>% dplyr::select('ID', 'case', 'sampleid', 'brainregion', 'positiononplate',
'Gender', 'Age', 'SiteHM', 'RIN', 'PMI', 'Dx')
datMeta$brainregion = substr(datMeta$ID, 1, 4)
datMeta = datMeta %>% mutate(brain_lobe = ifelse(brainregion=='ba19', 'Occipital', 'Frontal'),
Diagnosis = ifelse(Dx=='Autism', 'ASD', 'CTL'))
# Convert Diagnosis variable to factor
datMeta$Diagnosis = factor(datMeta$Diagnosis, levels=c('CTL','ASD'))
# sampleid is actually subject ID
datMeta = datMeta %>% dplyr::rename(Subject_ID = sampleid)
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>%
rename(Ensembl_gene_identifier='ensembl_gene_id', type_of_gene='gene_biotype', Symbol='hgnc_symbol') %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding',gene_biotype))
rm(GO_annotations)
Data description taken from the paper Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism:
Transcriptomes from 104 human brain cortical tissue samples were resolved using next-generation RNA sequencing technology at single-gene resolution and through co-expressing gene clusters or modules. Multiple cortical tissues corresponding to Brodmann Area 19 (BA19), Brodmann Area 10 (BA10) and Brodmann Area 44 (BA44) were sequenced in 62, 14 and 28 samples, respectively, resulting in a total of 57 (40 unique individuals) control and 47 (32 unique individuals) autism samples.
Note: They analysed all of the regions together
Brain tissue: Frozen brain samples were acquired through the Autism Tissue Program, with samples originating from two different sites: the Harvard Brain Tissue Resource Center and the NICHD Brain and Tissue Bank at the University of Maryland (Gandal’s data were obtained also from the Autism Tissue Program, specifically from the Harvard Brain Bank)
Sequenced using Illumina’s HiSeq 2000 (Gandal used Illumina HiSeq 2500, they are compatible)
print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 62069 genes from 120 samples belonging to 72 different subjects."
In the paper they talk about an original number of 110 samples and dropping 6 because of low gene coverage, resulting in 104 samples (which are the ones that are included in datMeta), but the expression dataset has 120 samples.
no_metadata_samples = colnames(datExpr)[! colnames(datExpr) %in% datMeta$ID]
no_metadata_subjects = unique(substring(no_metadata_samples, 6))
cat(paste0('Samples without metadata: ', paste(no_metadata_samples, collapse=', '), '\n\n'))
## Samples without metadata: ba10.s11, ba10.s12, ba10.s21, ba10.s24, ba10.s87, ba19.s13, ba19.s21, ba19.s54, ba19.s60, ba19.s87, ba44.s12, ba44.s21, ba44.s23, ba44.s24, ba44.s77, ba44.s87
cat(paste0('Samples without metadata but with subject ID in datMeta: ',
paste(no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID], collapse=', ')))
## Samples without metadata but with subject ID in datMeta: s11, s13, s60, s23
Since we need the metadata of the samples, I’m going to add the metadata of the samples that share a subject ID with some sample with metadata
add_metadata_subjects = no_metadata_subjects[no_metadata_subjects %in% datMeta$Subject_ID]
add_metadata_samples = no_metadata_samples[grepl(paste(add_metadata_subjects, collapse='|'),
no_metadata_samples)]
for(sample in add_metadata_samples){
new_row = datMeta %>% filter(Subject_ID == strsplit(sample,'\\.')[[1]][2]) %>% dplyr::slice(1) %>%
mutate(ID = sample,
brainregion = strsplit(sample,'\\.')[[1]][1],
brain_lobe = ifelse(strsplit(sample,'\\.')[[1]][1]=='ba19','Occipital','Frontal'))
datMeta = rbind(datMeta, new_row)
}
cat(paste0('Number of samples: ', nrow(datMeta)))
## Number of samples: 108
rm(no_metadata_subjects, no_metadata_samples, add_metadata_subjects, add_metadata_samples, sample, new_row)
And remove the samples that have no metadata and don’t have any other samples that do have metadata.
keep = substring(colnames(datExpr), 6) %in% datMeta$Subject_ID
cat(paste0('Removing ', sum(!keep) ,' samples (', paste(colnames(datExpr)[!keep], collapse=', '), ')\n\n'))
## Removing 12 samples (ba10.s12, ba10.s21, ba10.s24, ba10.s87, ba19.s21, ba19.s54, ba19.s87, ba44.s12, ba44.s21, ba44.s24, ba44.s77, ba44.s87)
cat(paste0('Belonging to subjects with IDs ',
paste0(unique(substring(colnames(datExpr)[!keep],6)), collapse=', '), '\n'))
## Belonging to subjects with IDs s12, s21, s24, s87, s54, s77
datExpr = datExpr[,keep]
# Match order of datExpr columns and datMeta rows
datMeta = datMeta[match(colnames(datExpr), datMeta$ID),]
# Check they are in the same order
if(!all(colnames(datExpr) == datMeta$ID)){
cat('\norder of samples don\'t match between datExpr and datMeta!\n')
}
cat(paste0('Removed ', sum(!keep), ' samples, ', sum(keep), ' remaining'))
## Removed 12 samples, 108 remaining
rm(keep)
Counts distribution: More than half of the counts are zero and most of the counts are relatively low, but there are some very high outliers (although the highest value is one order of magnitude lower than the maximum in the Gandal dataset)
count_distr = summary(melt(datExpr))[,2]
for(i in 1:6){
print(count_distr[i])
}
##
## "Min. : 0 "
##
## "1st Qu.: 0 "
##
## "Median : 0 "
##
## "Mean : 251 "
##
## "3rd Qu.: 8 "
##
## "Max. :7718873 "
rm(i, count_distr)
Diagnosis distribution: There are more CTL samples than controls, but it’s relatively balanced
table_info = datMeta %>% expss::apply_labels(Diagnosis = 'Diagnosis', brain_lobe = 'Brain Lobe', brainregion = 'Brain Region',
SiteHM = 'Batch', Gender = 'Gender')
cat('By Sample:')
## By Sample:
cro(table_info$Diagnosis)
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 58 |
| ASD | 50 |
| #Total cases | 108 |
cat('By Subject:')
## By Subject:
cro(table_info$Diagnosis[!duplicated(table_info$Subject_ID)])
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 40 |
| ASD | 32 |
| #Total cases | 72 |
Brain region distribution: The Occipital lobe has more samples than the Frontal lobe, even though we are combining two brain regions in the Frontal Lobe
cro(table_info$brainregion)
| #Total | |
|---|---|
| Brain Region | |
| ba10 | 15 |
| ba19 | 64 |
| ba44 | 29 |
| #Total cases | 108 |
cro(table_info$brain_lobe)
| #Total | |
|---|---|
| Brain Lobe | |
| Frontal | 44 |
| Occipital | 64 |
| #Total cases | 108 |
Most of the Control samples are from the Occipital lobe, the Autism samples are balanced between both lobes. This may cause problems because Ctl and Occipital are related
cro(table_info$Diagnosis, list(table_info$brain_lobe, total()))
| Brain Lobe | #Total | |||
|---|---|---|---|---|
| Frontal | Occipital | |||
| Diagnosis | ||||
| CTL | 19 | 39 | 58 | |
| ASD | 25 | 25 | 50 | |
| #Total cases | 44 | 64 | 108 | |
cat(paste0(round(100*sum(datMeta$Diagnosis=='CTL' & datMeta$brain_lobe=='Occipital')/sum(datMeta$brain_lobe=='Occipital')),
'% of the samples in the Occipital lobe are Control samples\n'))
## 61% of the samples in the Occipital lobe are Control samples
Gender distribution: There are thrice as many Male samples than Female ones
cro(table_info$Gender)
| #Total | |
|---|---|
| Gender | |
| F | 26 |
| M | 82 |
| #Total cases | 108 |
There is a small imbalance between gender and diagnosis with more males in the control group than in the autism group
cro(table_info$Diagnosis, list(table_info$Gender,total()))
| Gender | #Total | |||
|---|---|---|---|---|
| F | M | |||
| Diagnosis | ||||
| CTL | 12 | 46 | 58 | |
| ASD | 14 | 36 | 50 | |
| #Total cases | 26 | 82 | 108 | |
cat(paste0('\n',round(100*sum(datMeta$Diagnosis=='CTL' & datMeta$Gender=='M')/sum(datMeta$Diagnosis=='CTL')),
'% of the Control samples are Male\n'))
##
## 79% of the Control samples are Male
cat(paste0(round(100*sum(datMeta$Diagnosis=='ASD' & datMeta$Gender=='M')/sum(datMeta$Diagnosis=='ASD')),
'% of the Autism samples are Male'))
## 72% of the Autism samples are Male
Age distribution: Subjects between 2 and 82 years old with a mean close to 20
Control samples are less evenly distributed across ages than Autism samples
summary(datMeta$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 8.75 18.00 20.49 22.00 82.00
datMeta_by_subject = datMeta %>% filter(!duplicated(Subject_ID))
datMeta_by_subject %>% ggplot(aes(Age)) +
geom_density(alpha=0.5, aes(group=Diagnosis, fill=Diagnosis), color='transparent') +
geom_density(alpha=0.5, fill='gray', color='transparent') +
theme_minimal()
rm(datMeta_by_subject, table_info)
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
1.Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/20_02_07_clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys (17 genes return multiple labels)
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
labels_source = data.frame(data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4)))
########################################################################################
# 1. Query archive version
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
datGenes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart) %>%
rename(external_gene_id = 'hgnc_symbol')
## Batch submitting query [-------------------------------] 2% eta: 1m
## Batch submitting query [>------------------------------] 2% eta: 1m
## Batch submitting query [>------------------------------] 3% eta: 1m
## Batch submitting query [>------------------------------] 4% eta: 1m
## Batch submitting query [>------------------------------] 5% eta: 1m
## Batch submitting query [=>-----------------------------] 6% eta: 1m
## Batch submitting query [=>-----------------------------] 7% eta: 1m Batch
## submitting query [=>-----------------------------] 8% eta: 1m Batch submitting
## query [==>----------------------------] 9% eta: 1m Batch submitting query
## [==>----------------------------] 10% eta: 1m Batch submitting query
## [==>----------------------------] 11% eta: 1m Batch submitting query
## [===>---------------------------] 12% eta: 1m Batch submitting query
## [===>---------------------------] 13% eta: 49s Batch submitting query
## [===>---------------------------] 14% eta: 47s Batch submitting query
## [===>---------------------------] 14% eta: 45s Batch submitting query
## [====>--------------------------] 15% eta: 44s Batch submitting query
## [====>--------------------------] 16% eta: 42s Batch submitting query
## [====>--------------------------] 17% eta: 41s Batch submitting query
## [====>--------------------------] 18% eta: 40s Batch submitting query
## [=====>-------------------------] 18% eta: 39s Batch submitting query
## [=====>-------------------------] 19% eta: 38s Batch submitting query
## [=====>-------------------------] 20% eta: 37s Batch submitting query
## [=====>-------------------------] 21% eta: 36s Batch submitting query
## [======>------------------------] 22% eta: 35s Batch submitting query
## [======>------------------------] 22% eta: 34s Batch submitting query
## [======>------------------------] 23% eta: 33s Batch submitting query
## [======>------------------------] 24% eta: 32s Batch submitting query
## [=======>-----------------------] 25% eta: 32s Batch submitting query
## [=======>-----------------------] 26% eta: 31s Batch submitting query
## [=======>-----------------------] 26% eta: 30s Batch submitting query
## [=======>-----------------------] 27% eta: 30s Batch submitting query
## [========>----------------------] 28% eta: 29s Batch submitting query
## [========>----------------------] 29% eta: 28s Batch submitting query
## [========>----------------------] 30% eta: 28s Batch submitting query
## [========>----------------------] 30% eta: 27s Batch submitting query
## [=========>---------------------] 31% eta: 27s Batch submitting query
## [=========>---------------------] 32% eta: 26s Batch submitting query
## [=========>---------------------] 33% eta: 26s Batch submitting query
## [=========>---------------------] 34% eta: 25s Batch submitting query
## [==========>--------------------] 34% eta: 25s Batch submitting query
## [==========>--------------------] 35% eta: 24s Batch submitting query
## [==========>--------------------] 36% eta: 24s Batch submitting query
## [==========>--------------------] 37% eta: 23s Batch submitting query
## [===========>-------------------] 38% eta: 23s Batch submitting query
## [===========>-------------------] 39% eta: 22s Batch submitting query
## [===========>-------------------] 40% eta: 22s Batch submitting query
## [============>------------------] 41% eta: 21s Batch submitting query
## [============>------------------] 42% eta: 21s Batch submitting query
## [============>------------------] 42% eta: 20s Batch submitting query
## [============>------------------] 43% eta: 20s Batch submitting query
## [=============>-----------------] 44% eta: 20s Batch submitting query
## [=============>-----------------] 45% eta: 19s Batch submitting query
## [=============>-----------------] 46% eta: 19s Batch submitting query
## [==============>----------------] 47% eta: 18s Batch submitting query
## [==============>----------------] 48% eta: 18s Batch submitting query
## [==============>----------------] 49% eta: 18s Batch submitting query
## [==============>----------------] 50% eta: 17s Batch submitting query
## [===============>---------------] 50% eta: 17s Batch submitting query
## [===============>---------------] 51% eta: 17s Batch submitting query
## [===============>---------------] 52% eta: 16s Batch submitting query
## [===============>---------------] 53% eta: 16s Batch submitting query
## [================>--------------] 54% eta: 16s Batch submitting query
## [================>--------------] 54% eta: 15s Batch submitting query
## [================>--------------] 55% eta: 15s Batch submitting query
## [================>--------------] 56% eta: 15s Batch submitting query
## [=================>-------------] 57% eta: 14s Batch submitting query
## [=================>-------------] 58% eta: 14s Batch submitting query
## [=================>-------------] 59% eta: 14s Batch submitting query
## [==================>------------] 60% eta: 13s Batch submitting query
## [==================>------------] 61% eta: 13s Batch submitting query
## [==================>------------] 62% eta: 13s Batch submitting query
## [==================>------------] 62% eta: 12s Batch submitting query
## [===================>-----------] 63% eta: 12s Batch submitting query
## [===================>-----------] 64% eta: 12s Batch submitting query
## [===================>-----------] 65% eta: 11s Batch submitting query
## [===================>-----------] 66% eta: 11s Batch submitting query
## [====================>----------] 66% eta: 11s Batch submitting query
## [====================>----------] 67% eta: 11s Batch submitting query
## [====================>----------] 68% eta: 10s Batch submitting query
## [====================>----------] 69% eta: 10s Batch submitting query
## [=====================>---------] 70% eta: 10s Batch submitting query
## [=====================>---------] 71% eta: 9s Batch submitting query
## [=====================>---------] 72% eta: 9s Batch submitting query
## [======================>--------] 73% eta: 9s Batch submitting query
## [======================>--------] 74% eta: 8s Batch submitting query
## [======================>--------] 75% eta: 8s Batch submitting query
## [=======================>-------] 76% eta: 8s Batch submitting query
## [=======================>-------] 77% eta: 7s Batch submitting query
## [=======================>-------] 78% eta: 7s Batch submitting query
## [========================>------] 79% eta: 7s Batch submitting query
## [========================>------] 80% eta: 6s Batch submitting query
## [========================>------] 81% eta: 6s Batch submitting query
## [========================>------] 82% eta: 6s Batch submitting query
## [=========================>-----] 82% eta: 6s Batch submitting query
## [=========================>-----] 83% eta: 5s Batch submitting query
## [=========================>-----] 84% eta: 5s Batch submitting query
## [=========================>-----] 85% eta: 5s Batch submitting query
## [==========================>----] 86% eta: 5s Batch submitting query
## [==========================>----] 86% eta: 4s Batch submitting query
## [==========================>----] 87% eta: 4s Batch submitting query
## [==========================>----] 88% eta: 4s Batch submitting query
## [===========================>---] 89% eta: 3s Batch submitting query
## [===========================>---] 90% eta: 3s Batch submitting query
## [===========================>---] 91% eta: 3s Batch submitting query
## [============================>--] 92% eta: 2s Batch submitting query
## [============================>--] 93% eta: 2s Batch submitting query
## [============================>--] 94% eta: 2s Batch submitting query
## [=============================>-] 95% eta: 1s Batch submitting query
## [=============================>-] 96% eta: 1s Batch submitting query
## [=============================>-] 97% eta: 1s Batch submitting query
## [=============================>-] 98% eta: 1s Batch submitting query
## [==============================>] 98% eta: 0s Batch submitting query
## [==============================>] 99% eta: 0s Batch submitting query
## [===============================] 100% eta: 0s
datGenes$length = datGenes$end_position-datGenes$start_position
cat(paste0('1. ', nrow(datGenes), '/', nrow(datExpr),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 60489/62069 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 39797/60489 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), mart=mart,
values=datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
## Batch submitting query [>------------------------------] 2% eta: 1m
## Batch submitting query [>------------------------------] 4% eta: 2m
## Batch submitting query [=>-----------------------------] 5% eta: 2m
## Batch submitting query [=>-----------------------------] 6% eta: 1m Batch
## submitting query [=>-----------------------------] 8% eta: 2m Batch submitting
## query [==>----------------------------] 9% eta: 2m Batch submitting query
## [==>----------------------------] 10% eta: 2m Batch submitting query
## [==>----------------------------] 11% eta: 1m Batch submitting query
## [===>---------------------------] 12% eta: 2m Batch submitting query
## [===>---------------------------] 14% eta: 2m Batch submitting query
## [====>--------------------------] 15% eta: 2m Batch submitting query
## [====>--------------------------] 16% eta: 2m Batch submitting query
## [====>--------------------------] 18% eta: 2m Batch submitting query
## [=====>-------------------------] 19% eta: 2m Batch submitting query
## [=====>-------------------------] 20% eta: 2m Batch submitting query
## [======>------------------------] 21% eta: 2m Batch submitting query
## [======>------------------------] 22% eta: 2m Batch submitting query
## [======>------------------------] 24% eta: 2m Batch submitting query
## [=======>-----------------------] 25% eta: 2m Batch submitting query
## [=======>-----------------------] 26% eta: 2m Batch submitting query
## [========>----------------------] 28% eta: 2m Batch submitting query
## [========>----------------------] 29% eta: 2m Batch submitting query
## [========>----------------------] 30% eta: 2m Batch submitting query
## [=========>---------------------] 31% eta: 2m Batch submitting query
## [=========>---------------------] 32% eta: 2m Batch submitting query
## [=========>---------------------] 34% eta: 2m Batch submitting query
## [==========>--------------------] 35% eta: 2m Batch submitting query
## [==========>--------------------] 36% eta: 2m Batch submitting query
## [===========>-------------------] 38% eta: 2m Batch submitting query
## [===========>-------------------] 39% eta: 2m Batch submitting query
## [===========>-------------------] 40% eta: 2m Batch submitting query
## [============>------------------] 41% eta: 2m Batch submitting query
## [============>------------------] 42% eta: 2m Batch submitting query
## [=============>-----------------] 44% eta: 2m Batch submitting query
## [=============>-----------------] 45% eta: 2m Batch submitting query
## [=============>-----------------] 46% eta: 2m Batch submitting query
## [==============>----------------] 48% eta: 1m Batch submitting query
## [==============>----------------] 49% eta: 2m Batch submitting query
## [===============>---------------] 50% eta: 1m Batch submitting query
## [===============>---------------] 51% eta: 1m Batch submitting query
## [===============>---------------] 52% eta: 1m Batch submitting query
## [================>--------------] 54% eta: 1m Batch submitting query
## [================>--------------] 55% eta: 1m Batch submitting query
## [================>--------------] 56% eta: 1m Batch submitting query
## [=================>-------------] 57% eta: 1m Batch submitting query
## [=================>-------------] 59% eta: 1m Batch submitting query
## [==================>------------] 60% eta: 1m Batch submitting query
## [==================>------------] 61% eta: 1m Batch submitting query
## [==================>------------] 62% eta: 1m Batch submitting query
## [===================>-----------] 64% eta: 1m Batch submitting query
## [===================>-----------] 65% eta: 1m Batch submitting query
## [====================>----------] 66% eta: 1m Batch submitting query
## [====================>----------] 68% eta: 1m Batch submitting query
## [====================>----------] 69% eta: 1m Batch submitting query
## [=====================>---------] 70% eta: 1m Batch submitting query
## [=====================>---------] 71% eta: 1m Batch submitting query
## [=====================>---------] 72% eta: 47s Batch submitting query
## [======================>--------] 74% eta: 45s Batch submitting query
## [======================>--------] 75% eta: 42s Batch submitting query
## [=======================>-------] 76% eta: 39s Batch submitting query
## [=======================>-------] 78% eta: 37s Batch submitting query
## [=======================>-------] 79% eta: 35s Batch submitting query
## [========================>------] 80% eta: 32s Batch submitting query
## [========================>------] 81% eta: 32s Batch submitting query
## [=========================>-----] 82% eta: 29s Batch submitting query
## [=========================>-----] 84% eta: 28s Batch submitting query
## [=========================>-----] 85% eta: 25s Batch submitting query
## [==========================>----] 86% eta: 24s Batch submitting query
## [==========================>----] 88% eta: 22s Batch submitting query
## [===========================>---] 89% eta: 20s Batch submitting query
## [===========================>---] 90% eta: 18s Batch submitting query
## [===========================>---] 91% eta: 15s Batch submitting query
## [============================>--] 92% eta: 13s Batch submitting query
## [============================>--] 94% eta: 11s Batch submitting query
## [============================>--] 95% eta: 9s Batch submitting query
## [=============================>-] 96% eta: 7s Batch submitting query
## [=============================>-] 98% eta: 5s Batch submitting query
## [==============================>] 99% eta: 2s Batch submitting query
## [===============================] 100% eta: 0s
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 8041/39797 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes=getinfo, filters=c('hgnc_symbol'), mart=mart,
values=missing_genes)
## Batch submitting query [===>---------------------------] 13% eta: 3s
## Batch submitting query [=====>-------------------------] 20% eta: 13s
## Batch submitting query [=======>-----------------------] 27% eta: 10s
## Batch submitting query [=========>---------------------] 33% eta: 13s
## Batch submitting query [===========>-------------------] 40% eta: 14s
## Batch submitting query [=============>-----------------] 47% eta: 12s
## Batch submitting query [================>--------------] 53% eta: 13s
## Batch submitting query [==================>------------] 60% eta: 16s
## Batch submitting query [====================>----------] 67% eta: 12s
## Batch submitting query [======================>--------] 73% eta: 9s
## Batch submitting query [========================>------] 80% eta: 7s Batch
## submitting query [==========================>----] 87% eta: 5s Batch submitting
## query [============================>--] 93% eta: 3s Batch submitting query
## [===============================] 100% eta: 0s
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 5243/7038 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 17 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=missing_ensembl_ids, mart=mart)
## Batch submitting query [====>--------------------------] 15% eta: 1s
## Batch submitting query [======>------------------------] 23% eta: 2s
## Batch submitting query [=========>---------------------] 31% eta: 2s
## Batch submitting query [===========>-------------------] 38% eta: 2s
## Batch submitting query [=============>-----------------] 46% eta: 1s
## Batch submitting query [================>--------------] 54% eta: 1s
## Batch submitting query [==================>------------] 62% eta: 1s
## Batch submitting query [====================>----------] 69% eta: 1s
## Batch submitting query [=======================>-------] 77% eta: 1s Batch
## submitting query [=========================>-----] 85% eta: 0s Batch submitting
## query [============================>--] 92% eta: 0s Batch submitting query
## [===============================] 100% eta: 0s
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 0/6074 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),1))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position = 'stack', stat = 'identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank())
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$ensembl_gene_id),]
rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
cat(paste0('Considering all genes, this dataset contains ', length(unique(df$`gene-symbol`)),
' of the ', length(unique(SFARI_genes$`gene-symbol`)), ' SFARI genes\n\n'))
## Considering all genes, this dataset contains 977 of the 980 SFARI genes
missing_genes = unique(SFARI_genes$`gene-symbol`[! SFARI_genes$`gene-symbol` %in% df$`gene-symbol`])
cat('The missing genes are:')
## The missing genes are:
for(mg in missing_genes){
cat(paste0(mg, ' with a SFARI score of ', SFARI_genes$`gene-score`[SFARI_genes$`gene-symbol`==mg][1], '\n'))
}
## GRIN2B with a SFARI score of 1
## MIR137 with a SFARI score of 3
## ZNF8 with a SFARI score of 5
rm(df, missing_genes, mg)
1. Filter entries that weren’t found in any of the NCBI annotations or biomart queries
to_keep = !is.na(datGenes$length)
#cat(paste0('Names of the rows removed: ', paste(rownames(datExpr)[!to_keep], collapse=', ')))
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id
cat(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## Removed 1580 genes, 60489 remaining
2. Filter genes that do not encode any protein
cat(paste0(sum(datGenes$gene_biotype=='protein_coding'), '/', nrow(datGenes), ' are protein coding genes' ))
## 22105/60489 are protein coding genes
sort(table(datGenes$gene_biotype), decreasing=TRUE)
##
## protein_coding lncRNA
## 22105 11082
## processed_pseudogene unprocessed_pseudogene
## 9400 2479
## 1 miRNA
## 2278 2203
## misc_RNA snRNA
## 2155 2008
## pseudogene snoRNA
## 1289 1178
## lincRNA transcribed_unprocessed_pseudogene
## 656 647
## rRNA_pseudogene transcribed_processed_pseudogene
## 496 418
## 3 antisense
## 330 308
## 6 IG_V_pseudogene
## 305 168
## TR_V_gene IG_V_gene
## 146 134
## transcribed_unitary_pseudogene TR_J_gene
## 86 81
## unitary_pseudogene processed_transcript
## 74 56
## rRNA TR_V_pseudogene
## 47 46
## sense_intronic sense_overlapping
## 43 38
## scaRNA IG_D_gene
## 31 27
## polymorphic_pseudogene 7
## 27 25
## Mt_tRNA IG_J_gene
## 22 18
## 4 IG_C_gene
## 17 14
## TEC IG_C_pseudogene
## 11 8
## TR_C_gene ribozyme
## 8 5
## TR_J_pseudogene 3prime_overlapping_ncrna
## 5 4
## IG_J_pseudogene TR_D_gene
## 3 3
## Mt_rRNA 8
## 2 1
## translated_processed_pseudogene translated_unprocessed_pseudogene
## 1 1
Most of the genes with low expression levels are not protein-coding
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = apply(datExpr, 1, mean),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) + geom_density(alpha=0.5) +
theme_minimal())
rm(plot_data)
Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
cat(paste0('Filtering protein coding genes, we are left with ', length(unique(df$`gene-symbol`[df$gene_biotype=='protein_coding'])),
' SFARI genes'))
## Filtering protein coding genes, we are left with 974 SFARI genes
kable(df %>% filter(! `gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>%
dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`), caption='Lost Genes')
| ID | gene-symbol | gene-score | gene_biotype | syndromic | number-of-reports |
|---|---|---|---|---|---|
| ENSG00000187951 | ARHGAP11B | 4 | lncRNA | 0 | 2 |
| ENSG00000251593 | MSNP1AS | 2 | processed_pseudogene | 0 | 12 |
| ENSG00000197558 | SSPO | 4 | transcribed_unitary_pseudogene | 0 | 3 |
rm(df)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
rownames(datExpr) = datGenes$ensembl_gene_id
rownames(datGenes) = datGenes$ensembl_gene_id
cat(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## 974 SFARI genes remaining
cat(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## Removed 38384 genes, 22105 remaining
3. Filter genes with low expression levels
\(\qquad\) 3.1 Remove genes with zero expression in all of the samples
to_keep = rowSums(datExpr)>0
cat(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## Removed 2801 genes, 19304 remaining
# We are filtering out many SFARI genes with level of expression 0 but turns out most of them had another copy in the dataset
# in another row, so these genes are not actually lost
genes_with_copies = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums>0)
df = data.frame('rowSums' = rowSums(datExpr), 'ensembl_gene_id' = rownames(datExpr)) %>%
right_join(SFARI_genes, by='ensembl_gene_id') %>% filter(rowSums==0 & !is.na(`gene-score`)) %>%
arrange(`gene-score`) %>% dplyr::select(-ensembl_gene_id) %>%
filter(!duplicated(`gene-symbol`), !`gene-symbol` %in% genes_with_copies$`gene-symbol`)
kable(df %>% dplyr::select(ID, `gene-symbol`, `gene-score`, syndromic, `number-of-reports`),
caption='Lost Genes with SFARI Scores')
| ID | gene-symbol | gene-score | syndromic | number-of-reports |
|---|---|---|---|---|
| ENSG00000235718 | MFRP | 3 | 0 | 6 |
| ENSG00000167014 | TERB2 | 4 | 0 | 1 |
| ENSG00000122728 | TAF1L | 6 | 0 | 3 |
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
cat(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## 971 SFARI genes remaining
rm(df, genes_with_copies)
\(\qquad\) 2.2 Removing genes with a high percentage of zeros
Choosing the threshold:
Criteria for selecting the percentage of zeros threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)
On the plot I’m using the “dual” of the maximum percentage of zeros, the minimum percentage of non zeros so the visualisation is more intuitive
75% seems to be a good threshold for the minimum percentage of non zeros, so 25% will be the maximum percentage of zeros allowed in a row
The Mean vs SD plot doesn’t show all of the genes, a random sample was selected for the genes with higher level of expression so the visualisation wouldn’t be as heavy (and since we care about the genes with the lowest levels of expression, we aren’t losing important information)
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
# Return to original variables
datExpr = datExpr_original
datGenes = datGenes_original
datMeta = datMeta_original
rm(datExpr_original, datGenes_original, datMeta_original, datExpr_vst, datGenes_vst, datMeta_vst)
Filtering
# Minimum percentage of non-zero entries allowed per gene
threshold = 80
plot_data = data.frame('id'=rownames(datExpr), 'non_zero_percentage' = apply(datExpr, 1, function(x) 100*mean(x>0)))
ggplotly(plot_data %>% ggplot(aes(x=non_zero_percentage)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
geom_vline(xintercept=threshold, color='gray') + #scale_x_log10() +
ggtitle('Percentage of non-zero entries distribution') + theme_minimal())
to_keep = apply(datExpr, 1, function(x) 100*mean(x>0)) >= threshold
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
cat(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## 828 SFARI genes remaining
cat(paste0('Removed ', sum(!to_keep), ' genes, ', sum(to_keep), ' remaining'))
## Removed 5606 genes, 13698 remaining
rm(threshold, plot_data, to_keep)
3. Filter outlier samples
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples
Only 2 outliers, both with ASD Diagnosis
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$ID,
'Subject_ID'=datMeta$Subject_ID, 'Site'=datMeta$SiteHM,
'Brain_Lobe'=datMeta$brain_lobe, 'Sex'=datMeta$Gender, 'Age'=datMeta$Age,
'Diagnosis'=datMeta$Diagnosis, 'PMI'=as.numeric(datMeta$PMI))
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
cat(paste0('Outlier samples: ', paste(as.character(plot_data$Sample_ID[plot_data$distance< -2]), collapse=', ')))
## Outlier samples: ba19.s13, ba44.s23
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
cat(paste0('Removed ', sum(!to_keep), ' samples, ', sum(to_keep), ' remaining'))
## Removed 2 samples, 106 remaining
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
cat(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## After filtering, the dataset consists of 13698 genes and 106 samples
According to Tackling the widespread and critical impact of batch effects in high-throughput data, technical artifacts can be an important source of variability in the data, so batch correction should be part of the standard preprocessing pipeline of gene expression data.
They say Processing group and Date of the experiment are good batch surrogates, I only have processing group, so I’m going to see if this affects the data in any clear way to use it as a surrogate.
All the information we have is the Brain Bank (H/M), and although all the samples were obtained from the Autism Tissue Project, we don’t have any more specific information about who preprocessed each sample
table(datMeta$SiteHM)
##
## H M
## 49 57
There seems to be an important bias between the site that processed the samples and the objective variable, so the batch effect can be confused with the diagnosis effect.
table(datMeta$SiteHM, datMeta$Diagnosis)
##
## CTL ASD
## H 13 36
## M 45 12
Samples don’t seem to cluster together that strongly for each batch, although there does seem to be some kind of relation, but it could be due to diagnosis, not to batch (this is the problem with unbalanced diagnosis between batches!)
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram
create_viridis_dict = function(){
min_age = datMeta$Age %>% min
max_age = datMeta$Age %>% max
viridis_age_cols = viridis(max_age - min_age + 1)
names(viridis_age_cols) = seq(min_age, max_age)
return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()
dend_meta = datMeta[match(labels(h_clusts), datMeta$ID),] %>%
mutate('Site' = ifelse(SiteHM=='H', '#F8766D', '#00BFC4'),
'Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Sex' = ifelse(Gender=='F','#ff6666','#008ae6'), # Pink Female, Blue Male
'Region' = case_when(brain_lobe=='Frontal'~'#F8766D', # ggplot defaults for 2 colours
brain_lobe=='Occipital'~'#00BFC4'),
'Age' = viridis_age_cols[as.character(Age)]) %>% # Purple: young, Yellow: old
dplyr::select(Age, Region, Sex, Diagnosis, Site)
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>% dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)
rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)
Comparing the mean expression of each sample by batch we can see there is some batch effect differentiating them, but it could be because of the imbalance in Diagnosis by batches, since in the exploratory analysis we can see that ASD samples have a higher general level of expression compared to the CTL group (see 20_03_31_exploratory_analysis.html)
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='H']), 'Batch'='H')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='M']), 'Batch'='M')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)
Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2.
Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis)
## converting counts to integer mode
dds = estimateSizeFactors(dds)
norm.cts = counts(dds, normalized=TRUE)
Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.
mod = model.matrix(~ Diagnosis, colData(dds))
mod0 = model.matrix(~ 1, colData(dds))
sva_fit = svaseq(norm.cts, mod=mod, mod0=mod0)
## Number of significant surrogate variables is: 23
## Iteration (out of 5 ):1 2 3 4 5
rm(mod, mod0, norm.cts)
Found 23 surrogate variables, since there is no direct way to select which ones to pick Bioconductor answer, decided to keep all of them.
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV',1:ncol(sv_data))
datMeta_sva = cbind(datMeta, sv_data)
rm(sv_data, sva_fit)
In conclusion: Site could work as a surrogate for batch effects, but has the HUGE downside that is correlated to Diagnosis. The sva package found other 23 variables that could work as surrogates which are now included in datMeta and should be included in the DEA.
Using DESeq2 package to perform normalisation. Chose this package over limma because limma uses the log transformed data as input instead of the raw counts and I have discovered that in this dataset, this transformation affects genes differently depending on their mean expression level, and genes with a high SFARI score are specially affected by this.
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
Using vst instead of rlog to perform normalisation. Bioconductor question explaining differences between methods. Chose vst because a) it is much faster than rlog (it is recommended to use vst for samples larger than 50), and b) Michael Love (author of DESEq2) recommends using it over rlog
Including a log fold change threshold of 0 in the results formula \(H_0:lfc=0\) because setting any other log fold change seems arbitrary and we risk losing genes with a significant differential expression for genes with a higher difference, but not necessarily as significant.
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
IRanges(datGenes$start_position, width=datGenes$length),
strand=datGenes$strand,
feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_sva)
dds = DESeqDataSet(se, design = ~ SiteHM + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 +
SV10 + SV11 + SV12 + SV13 + SV14 + SV15 + SV16 + SV17 + SV18 +
SV19 + SV20 + SV21 + SV22 + SV23 + Diagnosis)
## converting counts to integer mode
# Perform DEA
#dds = DESeq(dds) # Changed this for the three lines below because some rows don't converge
dds = estimateSizeFactors(dds)
dds = estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
dds = nbinomWaldTest(dds, maxit=10000)
## 76 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
DE_info = results(dds, lfcThreshold=0, altHypothesis='greaterAbs')
# Perform vst
vsd = vst(dds)
datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)
rm(counts, rowRanges, se, vsd)
Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic
meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()
When plotting point by point it seems like the genes with the lowest values behave differently
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
*Could have done this since before
save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/Gandal/filtered_raw_data.RData')
Rename normalised datasets to continue working with these
datExpr = datExpr_vst
datMeta = datMeta_vst %>% data.frame
datGenes = datGenes_vst
print(paste0(length(unique(SFARI_genes$`gene-symbol`[SFARI_genes$ID %in% rownames(datExpr)])), ' SFARI genes remaining'))
## [1] "828 SFARI genes remaining"
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 13698 genes and 106 samples"
rm(datExpr_vst, datMeta_vst, datGenes_vst, datMeta_sva)
By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data
In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.
Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:
Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)
But caution should be exercised to avoid removing biological signal of interest
We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective
Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed
# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
X = cbind(mod, svs)
Hat = solve(t(X) %*% X) %*% t(X)
beta = (Hat %*% t(datExpr))
rm(Hat)
gc()
P = ncol(mod)
return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp
# Correct
mod = model.matrix(~ Diagnosis, colData(dds))
svs = datMeta %>% dplyr::select(SV1:SV23) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp
rm(correctDatExpr)
Removing batch effects has a big impact in the distribution of the samples, separating them by diagnosis relatively well just using the first principal component (although the separation is not nearly as good as with the Gandal dataset)
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
'PC2'=pca_samples_before$x[,2], 'corrected'=0),
data.frame('ID'=colnames(datExpr), 'PC1'=-pca_samples_after$x[,1],
'PC2'=-pca_samples_after$x[,2], 'corrected'=1)) %>%
left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) +
xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
round(100*summary(pca_samples_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
round(100*summary(pca_samples_after)$importance[2,2],1))) +
ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)
It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups relatively well using only the first PC)
*Plot is done with only 10% of the genes because it was too heavy otherwise
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
'PC2'=-pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))
pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
round(100*summary(pca_genes_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
round(100*summary(pca_genes_after)$importance[2,2],1))) +
scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)
Decided to keep the corrected expression dataset
datExpr = datExpr_corrected
rm(datExpr_corrected)
Even after correcting the dataset for the surrogate variables found with sva, there is still a difference in mean expression by processing site. The problem is that processing site is correlated with Diagnosis, so by correcting it we risk be erasing relevant information related to ASD
plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='H']), 'Batch'='H')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$SiteHM=='M']), 'Batch'='M')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)
I will save the batch corrected dataset as a different dataset because of the correlation between processing site and diagnosis
https://support.bioconductor.org/p/50983/
datExpr_combat = datExpr %>% as.matrix %>% ComBat(batch=datMeta$SiteHM)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
Now both batches have almost the same mean expression (we’d have to see what effect this has on the Diagnosis variable)
plot_data_b1 = data.frame('Mean'=colMeans(datExpr_combat[,datMeta$SiteHM=='H']), 'Batch'='H')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr_combat[,datMeta$SiteHM=='M']), 'Batch'='M')
plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))
ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) +
geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)
save(datExpr, datMeta, datGenes, DE_info, dds, file='./../Data/preprocessed_data.RData')
save(datExpr_combat, datMeta, datGenes, DE_info, dds, file='./../Data/preprocessed_data_ComBat.RData')
#load('./../Data/Gandal/preprocessed_data.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.28 expss_0.10.2
## [3] dendextend_1.13.4 vsn_3.52.0
## [5] WGCNA_1.69 fastcluster_1.1.25
## [7] dynamicTreeCut_1.63-1 sva_3.32.1
## [9] genefilter_1.66.0 mgcv_1.8-31
## [11] nlme_3.1-144 DESeq2_1.24.0
## [13] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [15] BiocParallel_1.18.1 matrixStats_0.56.0
## [17] Biobase_2.44.0 GenomicRanges_1.36.1
## [19] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [21] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [23] biomaRt_2.40.5 ggpubr_0.2.5
## [25] magrittr_1.5 ggExtra_0.9
## [27] GGally_1.5.0 gridExtra_2.3
## [29] viridis_0.5.1 viridisLite_0.3.0
## [31] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [33] plotly_4.9.2 glue_1.3.2
## [35] reshape2_1.4.3 forcats_0.5.0
## [37] stringr_1.4.0 dplyr_0.8.5
## [39] purrr_0.3.3 readr_1.3.1
## [41] tidyr_1.0.2 tibble_3.0.0
## [43] ggplot2_3.3.0 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5 Hmisc_4.4-0
## [4] plyr_1.8.6 lazyeval_0.2.2 splines_3.6.3
## [7] crosstalk_1.1.0.1 digest_0.6.25 foreach_1.5.0
## [10] htmltools_0.4.0 GO.db_3.8.2 fansi_0.4.1
## [13] checkmate_2.0.0 memoise_1.1.0 doParallel_1.0.15
## [16] cluster_2.1.0 limma_3.40.6 annotate_1.62.0
## [19] modelr_0.1.6 prettyunits_1.1.1 jpeg_0.1-8.1
## [22] colorspace_1.4-1 blob_1.2.1 rvest_0.3.5
## [25] haven_2.2.0 xfun_0.12 hexbin_1.28.1
## [28] crayon_1.3.4 RCurl_1.98-1.1 jsonlite_1.6.1
## [31] impute_1.58.0 iterators_1.0.12 survival_3.1-11
## [34] gtable_0.3.0 zlibbioc_1.30.0 XVector_0.24.0
## [37] scales_1.1.0 DBI_1.1.0 miniUI_0.1.1.1
## [40] Rcpp_1.0.4 xtable_1.8-4 progress_1.2.2
## [43] htmlTable_1.13.3 foreign_0.8-75 bit_1.1-15.2
## [46] preprocessCore_1.46.0 Formula_1.2-3 htmlwidgets_1.5.1
## [49] httr_1.4.1 acepack_1.4.1 ellipsis_0.3.0
## [52] farver_2.0.3 pkgconfig_2.0.3 reshape_0.8.8
## [55] XML_3.99-0.3 nnet_7.3-13 dbplyr_1.4.2
## [58] locfit_1.5-9.4 labeling_0.3 tidyselect_1.0.0
## [61] rlang_0.4.5 later_1.0.0 AnnotationDbi_1.46.1
## [64] munsell_0.5.0 cellranger_1.1.0 tools_3.6.3
## [67] cli_2.0.2 generics_0.0.2 RSQLite_2.2.0
## [70] broom_0.5.5 evaluate_0.14 fastmap_1.0.1
## [73] yaml_2.2.1 bit64_0.9-7 fs_1.4.0
## [76] mime_0.9 xml2_1.2.5 compiler_3.6.3
## [79] rstudioapi_0.11 curl_4.3 png_0.1-7
## [82] affyio_1.54.0 ggsignif_0.6.0 reprex_0.3.0
## [85] geneplotter_1.62.0 stringi_1.4.6 highr_0.8
## [88] lattice_0.20-40 Matrix_1.2-18 vctrs_0.2.4
## [91] pillar_1.4.3 lifecycle_0.2.0 BiocManager_1.30.10
## [94] cowplot_1.0.0 data.table_1.12.8 bitops_1.0-6
## [97] httpuv_1.5.2 affy_1.62.0 R6_2.4.1
## [100] latticeExtra_0.6-29 promises_1.1.0 codetools_0.2-16
## [103] assertthat_0.2.1 withr_2.1.2 GenomeInfoDbData_1.2.1
## [106] hms_0.5.3 grid_3.6.3 rpart_4.1-15
## [109] rmarkdown_2.1 shiny_1.4.0.2 lubridate_1.7.4
## [112] base64enc_0.1-3